# library
library(ape) # manual: https://cran.r-project.org/web/packages/ape/ape.pdf
library(nlme) # manual: https://cran.r-project.org/web/packages/nlme/nlme.pdf
library(MASS)
library(car)
library(devtools)
library(tidyverse)
library(broom)
library(ggplot2)
library(ggpubr)
piS without log transformation. Kendall is used
piS with log transformation - Linearity improves. Kendall is used
The observed trend above indicates a more efficient action of natural selection on species with higher Ne. The effect seems stronger between log10(piS) and omegaNA. To infer the significance of these correlations we need to take into account the phylogenetic signal.
# load data with omegaA and omegaNA plus per species factors for the two datasets
spp_summ_gene <- read.table("./Data/1_Dataset/20spp_CatGene_rates+piS+SppRho.txt",sep="|", header=TRUE)
spp_summ_gene$Category_gene2 <- factor(spp_summ_gene$Data, levels = c("Non-secreted", "Secreted", "Effectors")) # change order to ease comparison in the model
spp_summ_gene$piS_log <- log10(spp_summ_gene$piS)
remove_spp <- c("V. dahliae", "R. commune", "P. teres f. teres","M. oryzae rice", "F. graminearum", "C. parasitica")
spp_summ_gene <- spp_summ_gene[!spp_summ_gene$Species %in% remove_spp, ]
# Load tree
spp_tree_raw <- read.tree(file = "./Data/0_Taxonomy/14spp_phyliptree.phy", text = NULL, tree.names = NULL, skip = 0, comment.char = "", keep.multi = FALSE)
spp_tree_thinned_omegaA_raw <- read.tree(file = "./Data/0_Taxonomy/6spp_phyliptreeNamed.phy", text = NULL, tree.names = NULL, skip = 0, comment.char = "", keep.multi = FALSE)
spp_tree_thinned_omegaNA_raw <- read.tree(file = "./Data/0_Taxonomy/7_spp_omegaNA_rec.phy", text = NULL, tree.names = NULL, skip = 0, comment.char = "", keep.multi = FALSE)
# match label
rename_tree <- function(tree_object) {
tree_object$tip.label <- gsub("'","",tree_object$tip.label)
tree_object$tip.label <- gsub("Parastagonospora nodorum","P. nodorum",tree_object$tip.label)
tree_object$tip.label <- gsub("Venturia inaequalis","V. inaequalis",tree_object$tip.label)
tree_object$tip.label <- gsub("Penicillium biforme","P. biforme",tree_object$tip.label)
tree_object$tip.label <- gsub("Ophiostoma novo-ulmi subsp. novo-ulmi","O. novo-ulmi subsp. novo-ulmi",tree_object$tip.label)
tree_object$tip.label <- gsub("Sphaerulina musiva","S. musiva",tree_object$tip.label)
tree_object$tip.label <- gsub("Zymoseptoria tritici","Z. tritici",tree_object$tip.label)
tree_object$tip.label <- gsub("Ophiostoma novo-ulmi subsp. americana","O. novo-ulmi subsp. americana",tree_object$tip.label)
tree_object$tip.label <- gsub("Ophiostoma ulmi","O. ulmi",tree_object$tip.label)
tree_object$tip.label <- gsub("Cercospora beticola","C. beticola",tree_object$tip.label)
tree_object$tip.label <- gsub("Botrytis cinerea","B. cinera",tree_object$tip.label)
tree_object$tip.label <- gsub("Sclerotinia sclerotiorum","S. sclerotiorum",tree_object$tip.label)
tree_object$tip.label <- gsub("Pyricularia oryzae 4091-5-8","M. oryzae Triticum",tree_object$tip.label)
tree_object$tip.label <- gsub("Neurospora discreta","N. discreta",tree_object$tip.label)
tree_object$tip.label <- gsub("Aspergillus flavus","A. flavus",tree_object$tip.label)
tree_object$tip.label <- gsub("Fusarium graminearum","F. graminearum",tree_object$tip.label)
tree_object$tip.label <- gsub("Cryphonectria parasitica","C. parasitica",tree_object$tip.label)
tree_object$tip.label <- gsub("Pyricularia oryzae 70-15","M. oryzae rice",tree_object$tip.label)
tree_object$tip.label <- gsub("Pyrenophora teres f. teres","P. teres f. teres",tree_object$tip.label)
tree_object$tip.label <- gsub("Rhynchosporium commune","R. commune",tree_object$tip.label)
tree_object$tip.label <- gsub("Verticillium dahliae","V. dahliae",tree_object$tip.label)
return(tree_object)
}
spp_tree <- rename_tree(spp_tree_raw)
#spp_summ_gene$Species %in% spp_tree$tip.label
#spp_summ_rec$Species %in% spp_tree$tip.label # add into the model..
# Function for convenience:
normtest <- function(model) {
return(shapiro.test(resid(model)))
}
indeptest <- function(model) {
return(Box.test(resid(model)[order(fitted(model))], type = "Ljung-Box"))
}
# check tree. This tree is simply taken from NCBI taxonomy
plot(spp_tree)
Looking at the plots
above it’s possible to see an effect of gene function, mostly effectors,
on omegaA and omegaNA for some species. This effect is however not
widespread
m_omegaA_gene <- gls(omegaA ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaA_gene))
plot(resid(m_omegaA_gene)~fitted(m_omegaA_gene))
normtest(m_omegaA_gene)
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.99339, p-value = 0.997
indeptest(m_omegaA_gene)
##
## Box-Ljung test
##
## data: resid(model)[order(fitted(model))]
## X-squared = 1.8555, df = 1, p-value = 0.1731
summary(m_omegaA_gene)
## Generalized least squares fit by REML
## Model: omegaA ~ Category_gene2 + piS_log
## Data: spp_summ_gene
## AIC BIC logLik
## -91.20471 -81.37919 51.60235
##
## Correlation Structure: corGrafen
## Formula: ~Species
## Parameter estimate(s):
## rho
## 0.1364945
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.07582192 0.05333494 1.4216182 0.1633
## Category_gene2Secreted 0.02244116 0.01439549 1.5589023 0.1273
## Category_gene2Effectors 0.03641598 0.01439549 2.5296800 0.0157
## piS_log 0.00025630 0.02631897 0.0097381 0.9923
##
## Correlation:
## (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted 0.134
## Category_gene2Effectors -0.418 0.053
## piS_log 0.871 0.103 -0.103
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.02274749 -0.48854713 0.06739147 0.73310993 2.26165428
##
## Residual standard error: 0.05526628
## Degrees of freedom: 42 total; 38 residual
# calculate the VIF for each predictor variable in the model
vif(m_omegaA_gene)
## GVIF Df GVIF^(1/(2*Df))
## Category_gene2 1.022961 2 1.005691
## piS_log 1.022961 1 1.011415
# create vector of VIF values
vif_values <- vif(m_omegaA_gene)[,1]
# create horizontal bar chart to display each VIF value
barplot(vif_values, main = "GVIF", horiz = TRUE, col = "steelblue", xlim = c(0,6), cex.axis=1, cex.names=0.5) + abline(v = 5, lwd = 3, lty = 2)
## numeric(0)
summary(p1 <- powerTransform(omegaA ~ Category_gene2 + piS_log, spp_summ_gene, family = "bcnPower"))
spp_summ_gene$omegaA_trans <- bcnPower(spp_summ_gene$omegaA, lambda = p1$lambda, gamma = p1$gamma)
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaA_gene.trans))
plot(resid(m_omegaA_gene.trans)~fitted(m_omegaA_gene.trans))
normtest(m_omegaA_gene.trans)
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.98956, p-value = 0.9631
indeptest(m_omegaA_gene.trans)
##
## Box-Ljung test
##
## data: resid(model)[order(fitted(model))]
## X-squared = 1.4113, df = 1, p-value = 0.2348
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
summary(m_omegaA_gene.trans)
## Generalized least squares fit by REML
## Model: omegaA_trans ~ Category_gene2 + piS_log
## Data: spp_summ_gene
## AIC BIC logLik
## 11.90051 21.72602 0.04974725
##
## Correlation Structure: corGrafen
## Formula: ~Species
## Parameter estimate(s):
## rho
## 0.1081643
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -1.8046415 0.20364341 -8.861772 0.0000
## Category_gene2Secreted 0.0875076 0.05634282 1.553129 0.1287
## Category_gene2Effectors 0.1333529 0.05634282 2.366812 0.0231
## piS_log 0.0145549 0.10258236 0.141885 0.8879
##
## Correlation:
## (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted 0.119
## Category_gene2Effectors -0.406 0.035
## piS_log 0.875 0.088 -0.088
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.11105089 -0.48653192 0.03925545 0.77322652 1.99152341
##
## Residual standard error: 0.2145121
## Degrees of freedom: 42 total; 38 residual
m_omegaNA_gene <- gls(omegaNA ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaNA_gene))
plot(resid(m_omegaNA_gene)~fitted(m_omegaNA_gene))
normtest(m_omegaNA_gene)
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.94553, p-value = 0.04474
indeptest(m_omegaNA_gene)
##
## Box-Ljung test
##
## data: resid(model)[order(fitted(model))]
## X-squared = 3.2817, df = 1, p-value = 0.07006
summary(m_omegaNA_gene)
## Generalized least squares fit by REML
## Model: omegaNA ~ Category_gene2 + piS_log
## Data: spp_summ_gene
## AIC BIC logLik
## -77.0674 -67.24188 44.5337
##
## Correlation Structure: corGrafen
## Formula: ~Species
## Parameter estimate(s):
## rho
## 0.1890623
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.14117219 0.06581329 2.1450406 0.0384
## Category_gene2Secreted 0.00896762 0.01705849 0.5256984 0.6022
## Category_gene2Effectors 0.03996095 0.01705849 2.3425851 0.0245
## piS_log 0.02109889 0.03146361 0.6705806 0.5065
##
## Correlation:
## (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted 0.147
## Category_gene2Effectors -0.429 0.089
## piS_log 0.866 0.124 -0.124
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.0288494 -0.9425169 -0.6620065 0.3798331 2.5824111
##
## Residual standard error: 0.0665931
## Degrees of freedom: 42 total; 38 residual
# calculate the VIF for each predictor variable in the model
vif(m_omegaNA_gene)
## GVIF Df GVIF^(1/(2*Df))
## Category_gene2 1.034657 2 1.008554
## piS_log 1.034657 1 1.017181
# create vector of VIF values
vif_values <- vif(m_omegaNA_gene)[,1]
# create horizontal bar chart to display each VIF value
barplot(vif_values, main = "GVIF", horiz = TRUE, col = "steelblue", xlim = c(0,6), cex.axis=1, cex.names=0.5) + abline(v = 5, lwd = 3, lty = 2)
## numeric(0)
summary(p1 <- powerTransform(omegaNA ~ Category_gene2 + piS_log, spp_summ_gene, family = "bcnPower"))
spp_summ_gene$omegaNA_trans <- bcnPower(spp_summ_gene$omegaNA, lambda = p1$lambda, gamma = p1$gamma)
m_omegaNA_gene.trans <- gls(omegaNA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaNA_gene.trans))
plot(resid(m_omegaNA_gene.trans)~fitted(m_omegaNA_gene.trans))
normtest(m_omegaNA_gene.trans)
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.96677, p-value = 0.2563
indeptest(m_omegaNA_gene.trans)
##
## Box-Ljung test
##
## data: resid(model)[order(fitted(model))]
## X-squared = 5.0832, df = 1, p-value = 0.02416
m_omegaNA_gene.trans <- gls(omegaNA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
summary(m_omegaNA_gene.trans)
## Generalized least squares fit by REML
## Model: omegaNA_trans ~ Category_gene2 + piS_log
## Data: spp_summ_gene
## AIC BIC logLik
## 119.9635 129.789 -53.98176
##
## Correlation Structure: corGrafen
## Formula: ~Species
## Parameter estimate(s):
## rho
## 0.223764
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -2.9777675 0.8893300 -3.348327 0.0018
## Category_gene2Secreted 0.1396496 0.2254763 0.619354 0.5394
## Category_gene2Effectors 0.5616736 0.2254763 2.491054 0.0172
## piS_log 0.3881061 0.4181003 0.928261 0.3591
##
## Correlation:
## (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted 0.149
## Category_gene2Effectors -0.431 0.113
## piS_log 0.864 0.132 -0.132
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.8403201 -1.0707549 -0.7223019 0.6516418 1.9886622
##
## Residual standard error: 0.8899668
## Degrees of freedom: 42 total; 38 residual
spp_summ_gene$Category_gene2 <- factor(spp_summ_gene$Data, levels = c("Secreted", "Non-secreted", "Effectors")) # change order to ease comparison in the model
summary(p1 <- powerTransform(omegaA ~ Category_gene2 + piS_log, spp_summ_gene, family = "bcnPower"))
spp_summ_gene$omegaA_trans <- bcnPower(spp_summ_gene$omegaA, lambda = p1$lambda, gamma = p1$gamma)
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaA_gene.trans))
plot(resid(m_omegaA_gene.trans)~fitted(m_omegaA_gene.trans))
normtest(m_omegaA_gene.trans)
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.98956, p-value = 0.9631
indeptest(m_omegaA_gene.trans)
##
## Box-Ljung test
##
## data: resid(model)[order(fitted(model))]
## X-squared = 1.4113, df = 1, p-value = 0.2348
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
summary(m_omegaA_gene.trans)
## Generalized least squares fit by REML
## Model: omegaA_trans ~ Category_gene2 + piS_log
## Data: spp_summ_gene
## AIC BIC logLik
## 11.90051 21.72602 0.04974727
##
## Correlation Structure: corGrafen
## Formula: ~Species
## Parameter estimate(s):
## rho
## 0.1081643
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -1.7171339 0.21768180 -7.888275 0.0000
## Category_gene2Non-secreted -0.0875076 0.05634282 -1.553129 0.1287
## Category_gene2Effectors 0.0458452 0.07825866 0.585816 0.5615
## piS_log 0.0145549 0.10258236 0.141885 0.8879
##
## Correlation:
## (Intr) Ct_2N- Ctg_2E
## Category_gene2Non-secreted -0.371
## Category_gene2Effectors -0.534 0.694
## piS_log 0.842 -0.088 -0.126
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.11105089 -0.48653192 0.03925545 0.77322652 1.99152341
##
## Residual standard error: 0.2145121
## Degrees of freedom: 42 total; 38 residual
m_omegaNA_gene <- gls(omegaNA ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaNA_gene))
plot(resid(m_omegaNA_gene)~fitted(m_omegaNA_gene))
normtest(m_omegaNA_gene)
##
## Shapiro-Wilk normality test
##
## data: resid(model)
## W = 0.94553, p-value = 0.04474
indeptest(m_omegaNA_gene)
##
## Box-Ljung test
##
## data: resid(model)[order(fitted(model))]
## X-squared = 3.2817, df = 1, p-value = 0.07006
summary(m_omegaNA_gene)
## Generalized least squares fit by REML
## Model: omegaNA ~ Category_gene2 + piS_log
## Data: spp_summ_gene
## AIC BIC logLik
## -77.0674 -67.24188 44.5337
##
## Correlation Structure: corGrafen
## Formula: ~Species
## Parameter estimate(s):
## rho
## 0.1890623
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.15013980 0.07037398 2.1334561 0.0394
## Category_gene2Non-secreted -0.00896762 0.01705849 -0.5256984 0.6022
## Category_gene2Effectors 0.03099333 0.02303145 1.3456964 0.1864
## piS_log 0.02109889 0.03146361 0.6705806 0.5065
##
## Correlation:
## (Intr) Ct_2N- Ctg_2E
## Category_gene2Non-secreted -0.380
## Category_gene2Effectors -0.563 0.675
## piS_log 0.840 -0.124 -0.183
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.0288494 -0.9425169 -0.6620065 0.3798331 2.5824111
##
## Residual standard error: 0.0665931
## Degrees of freedom: 42 total; 38 residual